1 Required R libraries

if (!require("pacman")) install.packages("pacman")
## Indlæser krævet pakke: pacman
pacman::p_load("edgeR")
pacman::p_load("readr")
pacman::p_load("readxl")
pacman::p_load("biomaRt")
pacman::p_load("magrittr")
pacman::p_load("tibble")
pacman::p_load("stringr")
pacman::p_load("ggplot2")
pacman::p_load("data.table")
pacman::p_load("ggplot2", "patchwork")
pacman::p_load("openxlsx")
pacman::p_load("dplyr")
pacman::p_load("data.table")
pacman::p_load("ggvenn")
pacman::p_load("pheatmap")
pacman::p_load_gh("altintasali/aamisc")
library(ggrepel)

#Optional: Install archived R packages if necessary.
# This section is commented out, as these dependencies might not be needed in the final analysis.
# pacman::p_load("qvalue", "rain", "limma", "devtools")
# url <- "https://cran.r-project.org/src/contrib/Archive/HarmonicRegression/HarmonicRegression_1.0.tar.gz"
# pkgFile <- "HarmonicRegression_1.0.tar.gz"
# download.file(url = url, destfile = pkgFile)
# install.packages(pkgs=pkgFile, type="source", repos=NULL)
# file.remove(pkgFile)

# Define color palette for visualizations
# These colors represent conditions like AF, Metformin, and Sham.
publication_colors <- c("AF" = "#285291", "Metformin" = "#7C1516", "Sham" = "#9B9B9B")

2 Read data

2.1 Count matrix and metadata

# Load raw RNA-seq count data from the specified file path.
count_file <- "../../../../RNA-seq/data/count/gene-expression-all-reverse-stranded-countReadPairs.tsv"
count <- readr::read_delim(count_file)
## Rows: 38849 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (5): Geneid, Chr, Start, End, Strand
## dbl (61): Length, Dorado_LA, Dorado_RA, Im_A_Mets_Fan_LA, Im_A_Mets_Fan_RA, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remove specific horses from the analysis (natural AF cases).
# only horses relevant to the experimental design are included.
count <- as_tibble(count) 
count <- count %>%
  dplyr::select(-c(Dorado_LA, Dorado_RA, Im_A_Mets_Fan_LA, Im_A_Mets_Fan_RA, 
            Jytte_LA, Jytte_RA, Kevin_Cook_LA, Kevin_Cook_RA, 
            San_Diego_LA, San_Diego_RA, Styles_LA, Styles_RA))



# Load metadata associated with the samples from an Excel file.
meta_file <- "../../../../RNA-seq/data/metadata/meta.xlsx" 
meta <- readxl::read_excel(meta_file)

# Load and process gene annotation data.
# `geneinfo_file` contains information about gene identifiers, descriptions, and positions.
geneinfo_file <- "../../../../RNA-seq/data/gene_annotation/horse_gene_annotation.tsv.gz"
geneinfo <- fread(geneinfo_file)

# Set column names for the gene annotation data and remove unnecessary columns.
colnames(geneinfo)
##  [1] "Gene stable ID"                             
##  [2] "Gene stable ID version"                     
##  [3] "Gene description"                           
##  [4] "Chromosome/scaffold name"                   
##  [5] "Gene start (bp)"                            
##  [6] "Gene end (bp)"                              
##  [7] "Strand"                                     
##  [8] "Gene name"                                  
##  [9] "NCBI gene (formerly Entrezgene) ID"         
## [10] "NCBI gene (formerly Entrezgene) description"
setnames(geneinfo, new = c("ENSEMBL", "ENSEMBLv", "Description_detailed", 
                           "Chr", "Start", "End", "Strand", "GENENAME", "ENTREZID", "Description"))
geneinfo <- geneinfo[, c("ENSEMBLv", "Description_detailed") := NULL]
geneinfo <- geneinfo[!duplicated(ENSEMBL), ]

# Merge count data with gene annotation information.
# This ensures that the count matrix has relevant gene annotations for downstream analysis.
annot <- merge(x = count[,c("Geneid", "Length")], 
               y = geneinfo, 
               by.x = "Geneid",
               by.y = "ENSEMBL", 
               all.x = TRUE, 
               all.y = FALSE)
setnames(annot, old = "Geneid", new = "ENSEMBL")
annot <- data.frame(annot)
rownames(annot) <- annot$ENSEMBL

# Write the cleaned and merged annotation file for future reference.
fwrite(x = annot, 
       file = "../../../../RNA-seq/data/gene_annotation/horse_gene_annotation_filtered.tsv.gz", 
       sep = "\t")

# Clean up the metadata file and ensure correct row names.
meta <- meta %>% remove_rownames %>% column_to_rownames(var="Sample_ID")
head(meta)
##        Horse Region     Group    Condition
## M1_LA     M1     LA Metformin LA_Metformin
## M1_RA     M1     RA Metformin RA_Metformin
## M10_LA   M10     LA        AF        LA_AF
## M10_RA   M10     RA        AF        RA_AF
## M11_LA   M11     LA Metformin LA_Metformin
## M11_RA   M11     RA Metformin RA_Metformin
# Clean count matrix to remove unnecessary columns and set row names.
count <- count[, -c(2:6)]   # Remove columns not relevant for differential expression analysis
count <- count %>% remove_rownames %>% column_to_rownames(var="Geneid")
head(count)
##                    M10_LA M10_RA M11_LA M11_RA M12_LA M12_RA M13_LA M13_RA
## ENSECAG00000013298    115    171    173    180    198    196    130    133
## ENSECAG00000019402      0      0      0      0      0      0      0      0
## ENSECAG00000052423    788   1131   1085    796    914   1091   1323    912
## ENSECAG00000011978      8      7      2     11     16      9      6     14
## ENSECAG00000005166    174    358    184    343    204    309    323    252
## ENSECAG00000032916    406    567    434    416    403    594    548    426
##                    M14_LA M14_RA M15_LA M15_RA M16_LA M16_RA M17_LA M17_RA
## ENSECAG00000013298    195    184    147    164    192    186    171    159
## ENSECAG00000019402      0      0      0      0      0      0      0      0
## ENSECAG00000052423    817    949    966   1328   1009   1193   1310    932
## ENSECAG00000011978     10     13      4     15      7     13      5     10
## ENSECAG00000005166    278    193    256    237    259    314    303    256
## ENSECAG00000032916    355    520    362    733    435    717    485    433
##                    M18_LA M18_RA M19_LA M19_RA M1_LA M1_RA M20_LA M20_RA M22_LA
## ENSECAG00000013298    122    188    148    155   179   197    304    172    131
## ENSECAG00000019402      0      0      0      0     0     0      0      0      0
## ENSECAG00000052423   1037   1185   1187    838   827   687   1900   1466    843
## ENSECAG00000011978      7     17      6     11    13    10      4      9      7
## ENSECAG00000005166    305    279    304    255   266   264    410    189    252
## ENSECAG00000032916    450    455    448    378   431   314    999   1021    431
##                    M22_RA M23_LA M23_RA M24_LA M24_RA M25_LA M25_RA M2_LA M2_RA
## ENSECAG00000013298    175    185    173    201    182    183    171   161   156
## ENSECAG00000019402      0      0      0      0      0      0      0     0     0
## ENSECAG00000052423    957    933    921   1012    826   1152    953   959  1324
## ENSECAG00000011978     11      8      6      4      4      5      9     8    12
## ENSECAG00000005166    250    258    263    250    247    276    244   264   247
## ENSECAG00000032916    569    420    509    427    447    451    555   542   694
##                    M3_LA M3_RA M4_LA M4_RA M5_LA M5_RA M6_LA M6_RA M7_LA M7_RA
## ENSECAG00000013298   133   198   134   201   131   182   134   171   130   162
## ENSECAG00000019402     0     0     0     0     0     0     0     0     0     0
## ENSECAG00000052423   670   818   726   980   908   761   839   864   926  1496
## ENSECAG00000011978     6    13     4     6     8     5     6     5    10     9
## ENSECAG00000005166   161   269   198   281   175   240   300   165   287   243
## ENSECAG00000032916   324   402   280   468   527   460   470   494   427   942
##                    M8_LA M8_RA M9_LA M9_RA
## ENSECAG00000013298   200   188   157   210
## ENSECAG00000019402     0     0     0     0
## ENSECAG00000052423   833   788   856  1004
## ENSECAG00000011978    12    20    13    18
## ENSECAG00000005166   246   213   182   241
## ENSECAG00000032916   460   526   343   426

3 edgeR differential expression analysis for multilevel designs

3.1 Read count matrix

# Remove version numbers from gene names if present (e.g., "Gene.1" -> "Gene").
rownames(count) <- stringr::str_split_fixed(string = rownames(count), 
                                        pattern = '[.]',
                                        n = 2)[,1]

# Reorder metadata and annotation tables to match the column order of the count matrix.
column_order <- names(count)
meta_reordered <- meta[column_order, , drop = FALSE]

annot_order <- rownames(count)
annot_reordered <- annot[annot_order, ]

# Create a DGEList object for `edgeR` analysis.
# The DGEList object contains the count matrix, gene annotation, and sample metadata.
d <- DGEList(counts = count, genes = annot_reordered, samples = meta_reordered)

3.2 Filtering

# Filter out genes with low expression across all samples.
keep <- filterByExpr(d)
table(keep)  # Display the number of genes retained after filtering.
## keep
## FALSE  TRUE 
## 26832 12017
# Subset the DGEList object to keep only the filtered genes.
y <- d[keep, , keep.lib.sizes=FALSE]

3.3 Normalization

# Calculate normalization factors to adjust for differences in library sizes between samples.
y <- calcNormFactors(y)

# Write normalized counts to a file for multi-omics integration.
norm_counts <- cpm(y, normalized.lib.sizes=TRUE)
# Uncomment to save normalized counts for further analysis.
# write.table(norm_counts, file="../../../../RNA-seq/analysis/01_dge/output/rna_seq_data.txt", sep="\t", col.names=NA, quote=FALSE)

# Normalized counts with gene names included for easier multi-omics integration in OmicsAnalyst.
norm_counts_genenames <- as.data.frame(norm_counts)
norm_counts_genenames$ENSEMBL <- rownames(norm_counts_genenames)
norm_counts_genenames <- merge(norm_counts_genenames, annot[, c("ENSEMBL", "GENENAME")], by = "ENSEMBL")

# Use gene names when available, otherwise fallback to ENSEMBL IDs.
norm_counts_genenames$ID <- ifelse(is.na(norm_counts_genenames$GENENAME) | norm_counts_genenames$GENENAME == "", 
                                   norm_counts_genenames$ENSEMBL, 
                                   norm_counts_genenames$GENENAME)

# Ensure unique identifiers by appending ENSEMBL ID for duplicate gene names.
duplicates <- norm_counts_genenames$ID[duplicated(norm_counts_genenames$ID)]
norm_counts_genenames$ID <- ifelse(norm_counts_genenames$ID %in% duplicates, 
                                   paste(norm_counts_genenames$ID, norm_counts_genenames$ENSEMBL, sep = "_"), 
                                   norm_counts_genenames$ID)
rownames(norm_counts_genenames) <- norm_counts_genenames$ID

# Remove unnecessary columns for the final output.
norm_counts_genenames <- norm_counts_genenames[, !colnames(norm_counts_genenames) %in% c("ENSEMBL", "GENENAME", "ID")]

# Uncomment to save the final normalized counts with gene names.
# write.table(norm_counts_genenames, file="../../../../RNA-seq/analysis/01_dge/output/rna_seq_data_genenames.txt", sep="\t", col.names=NA, quote=FALSE)

# Create design matrix for linear modeling in edgeR
# Condition refers to different experimental conditions (e.g., Treatment, Control)
design <- model.matrix(~0 + Condition , y$samples)
colnames(design) <- gsub("Condition", "", colnames(design))

# Estimate dispersion values to model the variance of expression.
y <- estimateDisp(y, design)

# Calculate normalized expression levels (CPM - Counts Per Million)
CPM <- cpm(y)
logCPM <- cpm(y, log = TRUE)

3.3.1 Count distributions

# Transform the logCPM matrix into long format for ggplot2 visualization.
# This will allow each gene/sample pair to be plotted individually.
logCPM_melted <- data.table::melt(logCPM)
## Warning in data.table::melt(logCPM): The melt generic in data.table has been
## passed a matrix and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now deprecated
## as well. To continue using melt methods from reshape2 while both libraries are
## attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(logCPM). In the next version, this warning will become an error.
# Rename columns for better readability and consistency.
# "Var1" and "Var2" refer to the dimensions of the matrix and are renamed to "Gene" and "Sample".
setnames(x = logCPM_melted, 
         old = c("Var1", "Var2", "value"),
         new = c("Gene", "Sample", "logCPM"))

# Add experimental conditions like "Group" or "Region" to each sample for coloring in the plots.
logCPM_melted <- data.table::merge.data.table(x = logCPM_melted, 
                                              y = data.table(Sample = rownames(meta), meta), 
                                              by = "Sample")

# Color the boxes based on the experimental condition to highlight potential differences.
ggplot(logCPM_melted, aes(x = Sample, y = logCPM)) + 
  geom_boxplot(aes(color = Condition)) +   
  theme_bw() +                             
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +  
  ggtitle("logCPM boxplots")             

# Create density plots of logCPM values for each sample.
# This visualization helps to assess the distribution of expression values and identify batch effects.
ggplot(logCPM_melted, aes(x = logCPM)) + 
  geom_density(aes(group = Sample, color = Condition)) +  
  theme_bw() +                                           
  ggtitle("logCPM density distributions")                

3.4 Dimension reduction

3.4.1 MDS plot

# Create a Multi-dimensional Scaling (MDS) plot for the RNA-seq data.
# This plot visualizes the similarity between samples based on expression values, 
# with similar samples clustering together.
mds <- plotMDS(y, plot = FALSE)

dims <- list(p1 = c(1,2), p2 = c(1,3), p3 = c(2,3))
mds_plot <- list()

# without labels
for (i in seq_along(dims)){
  mds_plot[[i]] <- aamisc::ggMDS(mds = mds,
                                 meta = meta, 
                                 dim = dims[[i]], 
                                 color.by = "Group", 
                                 shape.by = "Region",
                                 legend.position = "right"
                                 # text.by = "Horse",
                                 # text.size = 1.5
                                 )
}


patchwork::wrap_plots(mds_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')

# Create MDS plots with sample labels for detailed view
for (i in seq_along(dims)){
  mds_plot[[i]] <- aamisc::ggMDS(mds = mds,
                                 meta = meta, 
                                 dim = dims[[i]], 
                                 color.by = "Group", 
                                 shape.by = "Region",
                                 legend.position = "right",
                                 text.by = "Horse",
                                 text.size = 1.5
                                 )
}

patchwork::wrap_plots(mds_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')

3.4.2 MDS plot (batch “horse ID” removed)

# Remove batch effects using the "removeBatchEffect" function.
# The "batch" parameter specifies the factor to remove (Horse ID in this case).
# This helps to isolate the effects of the experimental conditions by eliminating variability introduced by horses contributing with two samples each (LA and RA).
logCPM_batchRemoved <- removeBatchEffect(logCPM, 
                                         batch = as.factor(y$samples$Horse), 
                                         design = design)
## Coefficients not estimable: batch22 batch23
## Warning: Partial NA coefficients for 12017 probe(s)
mds <- plotMDS(logCPM_batchRemoved, plot = FALSE)

dims <- list(p1 = c(1,2), p2 = c(1,3), p3 = c(2,3))
mds_plot <- list()
meta$Group <- factor(meta$Group, levels = names(publication_colors))

# without labels
for (i in seq_along(dims)){
  mds_plot[[i]] <- aamisc::ggMDS(mds = mds,
                                 meta = meta, 
                                 dim = dims[[i]], 
                                 color.by = "Group", 
                                 shape.by = "Region",
                                 legend.position = "right"
                                 # text.by = "Horse",
                                 # text.size = 1.5
                                 ) + 
                  scale_color_manual(values = publication_colors)
}

patchwork::wrap_plots(mds_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')

# with labels
for (i in seq_along(dims)){
  mds_plot[[i]] <- aamisc::ggMDS(mds = mds,
                                 meta = meta, 
                                 dim = dims[[i]], 
                                 color.by = "Group", 
                                 shape.by = "Region",
                                 legend.position = "right",
                                 text.by = "Horse",
                                 text.size = 1.5
                                 ) + 
                  scale_color_manual(values = publication_colors)
}
mds_plot[1]
## [[1]]

patchwork::wrap_plots(mds_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')

4 Differential expression analysis

4.1 Create contrast for multilevel design

# Define contrasts to test specific hypotheses using the design matrix.
colnames(design)  # Display the names of the coefficients in the design matrix to verify the structure.
## [1] "LA_AF"        "LA_Metformin" "LA_Sham"      "RA_AF"        "RA_Metformin"
## [6] "RA_Sham"
# Create a list of contrasts based on experimental design.
con <- makeContrasts(
  # Comparisons of metformin treatment vs. placebo in both left (LA) and right atria (RA).
  met_vs_placebo_RA = RA_Metformin - RA_AF,              # Metformin vs. Placebo in RA
  met_vs_placebo_LA = LA_Metformin - LA_AF,              # Metformin vs. Placebo in LA
  
  # Comparison of regional differences (RA vs. LA) under placebo and metformin treatment.
  RA_vs_LA_placebo = RA_AF - LA_AF,                      # RA vs. LA under Placebo
  RA_vs_LA_met = RA_Metformin - LA_Metformin,            # RA vs. LA under Metformin
  
  # Average treatment effect across both atria.
  AverageTreatmentEffect = (LA_Metformin + RA_Metformin)/2 - (LA_AF + RA_AF)/2,

  # Interaction term: testing if the treatment effect differs between RA and LA.
  met.vs.placebo_vs_RA.LA = (RA_Metformin - RA_AF) - (LA_Metformin - LA_AF),
  
  # Comparisons of AF vs. Sham (control) in both left (LA) and right atria (RA).
  AF_vs_sham_RA = RA_AF - RA_Sham,                       # AF vs. Sham in RA
  AF_vs_sham_LA = LA_AF - LA_Sham,                       # AF vs. Sham in LA
  
  # Comparison of regional differences (RA vs. LA) under Sham and AF conditions.
  RA_vs_LA_sham = RA_Sham - LA_Sham,                     # RA vs. LA under Sham
  RA_vs_LA_AF = RA_AF - LA_AF,                           # RA vs. LA under AF
  
  # Average disease effect across both atria.
  AverageDiseaseEffect = (LA_AF + RA_AF)/2 - (LA_Sham + RA_Sham)/2,
  
  # Interaction term: testing if the disease effect differs between RA and LA.
  placebo.vs.sham_vs_RA.LA = (RA_AF - RA_Sham) - (LA_AF - LA_Sham),
  
  levels = design  # Specify the levels of the design matrix.
)

con  # Output the contrast matrix to verify that contrasts are correctly specified.
##               Contrasts
## Levels         met_vs_placebo_RA met_vs_placebo_LA RA_vs_LA_placebo
##   LA_AF                        0                -1               -1
##   LA_Metformin                 0                 1                0
##   LA_Sham                      0                 0                0
##   RA_AF                       -1                 0                1
##   RA_Metformin                 1                 0                0
##   RA_Sham                      0                 0                0
##               Contrasts
## Levels         RA_vs_LA_met AverageTreatmentEffect met.vs.placebo_vs_RA.LA
##   LA_AF                   0                   -0.5                       1
##   LA_Metformin           -1                    0.5                      -1
##   LA_Sham                 0                    0.0                       0
##   RA_AF                   0                   -0.5                      -1
##   RA_Metformin            1                    0.5                       1
##   RA_Sham                 0                    0.0                       0
##               Contrasts
## Levels         AF_vs_sham_RA AF_vs_sham_LA RA_vs_LA_sham RA_vs_LA_AF
##   LA_AF                    0             1             0          -1
##   LA_Metformin             0             0             0           0
##   LA_Sham                  0            -1            -1           0
##   RA_AF                    1             0             0           1
##   RA_Metformin             0             0             0           0
##   RA_Sham                 -1             0             1           0
##               Contrasts
## Levels         AverageDiseaseEffect placebo.vs.sham_vs_RA.LA
##   LA_AF                         0.5                       -1
##   LA_Metformin                  0.0                        0
##   LA_Sham                      -0.5                        1
##   RA_AF                         0.5                        1
##   RA_Metformin                  0.0                        0
##   RA_Sham                      -0.5                       -1
# Explanation: The "AverageDiseaseEffect" is not the same as the main disease effect, but we will leave that as we are only interested in the region-wise comparisons.

4.2 Differential analysis

# Perform differential expression analysis using `voomLmFit` function.
# This approach is suitable for RNA-seq data with complex experimental designs, such as multilevel/paired design models.
# The `block` argument specifies a blocking factor (Horse ID)

y_raw <- d[keep, ,keep.lib.sizes=FALSE]  # keep only filtered genes.

# Fit a linear model using the `voomLmFit` function with voom transformation.
# `plot = TRUE` will display the mean-variance trend, a diagnostic plot for voom transformation.
v <- voomLmFit(counts = y_raw, 
               design = design, 
               block = as.factor(y_raw$samples$Horse),  # Block by Horse ID to account for repeated measures/paired design (two atrial samples per horse).
               sample.weights = TRUE, 
               plot = TRUE)
## First sample weights (min/max) 0.2037813/1.8751727
## First intra-block correlation  0.2684471
## Final sample weights (min/max) 0.2035692/1.8656543
## Final intra-block correlation  0.2680694

# Initialize an empty list to store differential expression results for each contrast.
res <- list()  

# Loop through each contrast and perform differential analysis.
for (i in colnames(con)) {
  # Fit the contrast matrix to the linear model.
  fit <- contrasts.fit(v, contrasts = con)
  
  # Apply empirical Bayes moderation to the standard errors to improve statistical power.
  fit <- eBayes(fit, robust = TRUE)
  
  # Extract the top differentially expressed genes for each contrast.
  res[[i]] <- topTable(fit, coef = i, number = Inf)
  
  # Add the contrast name as a new column in the result table for easier identification.
  res[[i]] <- data.frame(res[[i]], Contrast = i)
  
  # Print the number of differentially expressed (DE) genes with adjusted p-value < 0.05.
  n <- res[[i]] %>% dplyr::filter(adj.P.Val < 0.05) %>% nrow 
  print(paste('Number of DE genes for', i, '=', n))
}
## [1] "Number of DE genes for met_vs_placebo_RA = 4428"
## [1] "Number of DE genes for met_vs_placebo_LA = 0"
## [1] "Number of DE genes for RA_vs_LA_placebo = 5174"
## [1] "Number of DE genes for RA_vs_LA_met = 1684"
## [1] "Number of DE genes for AverageTreatmentEffect = 878"
## [1] "Number of DE genes for met.vs.placebo_vs_RA.LA = 4043"
## [1] "Number of DE genes for AF_vs_sham_RA = 1950"
## [1] "Number of DE genes for AF_vs_sham_LA = 643"
## [1] "Number of DE genes for RA_vs_LA_sham = 761"
## [1] "Number of DE genes for RA_vs_LA_AF = 5174"
## [1] "Number of DE genes for AverageDiseaseEffect = 1975"
## [1] "Number of DE genes for placebo.vs.sham_vs_RA.LA = 19"
# Combine all contrast results into a single data frame for easier output and visualization.
res_all <- do.call(rbind, res)

# Create an output Excel file with results for each contrast.
# Uncomment to save
# openxlsx::write.xlsx(x = res, file = "../../../../RNA-seq/analysis/01_dge/output/dge_results.xlsx", asTable = TRUE)

# Create an output TSV file with the combined results for all contrasts.
# Uncomment to save
#data.table::fwrite(x = res_all, file = "../../../../RNA-seq/analysis/01_dge/output/dge_results.tsv.gz", sep = "\t")

4.3 Diagnostics for differential analysis

4.3.1 p-value histograms

# Create p-value histograms for each contrast.
# This visualization helps assess the distribution of p-values and identify potential issues
ggplot(res_all, aes(x = P.Value)) + 
  geom_histogram(fill = "lightgray",       
                 color = "black",          
                 breaks = seq(0, 1, by = 0.05), 
                 closed = "right",         
                 lwd = 0.2) +              
  facet_wrap(~ Contrast, nrow = 3, scales = "free") +  
  theme_bw()  

4.3.2 Volcano plots

volcano_plots <- list()
for (i in names(res)){
  volcano_plots[[i]] <- ggVolcano(x = res[[i]], 
                                  fdr = 0.05,
                                  fdr.column = "adj.P.Val", 
                                  pvalue.column = "P.Value", 
                                  logFC = 0, 
                                  logFC.column = "logFC", 
                                  text.size = 2) + 
    theme_bw(base_size = 10) + 
    ggtitle(i)
}
## Warning in max(x[get(fdr.column) <= .fdr][, get(pvalue.column)]): no
## non-missing arguments to max; returning -Inf
# Combine all volcano plots into a single layout with 3 columns.
patchwork::wrap_plots(volcano_plots, ncol = 3)

# Print individual volcano plots for key contrasts.
print(volcano_plots[["AF_vs_sham_RA"]])

print(volcano_plots[["AF_vs_sham_LA"]])

print(volcano_plots[["met_vs_placebo_RA"]])

print(volcano_plots[["met_vs_placebo_LA"]])

# Create a combined plot of the key contrasts for easier comparison.
combined_plot <- ((volcano_plots[["AF_vs_sham_RA"]]) | (volcano_plots[["AF_vs_sham_LA"]])) / ((volcano_plots[["met_vs_placebo_RA"]]) | (volcano_plots[["met_vs_placebo_LA"]]))
print(combined_plot)

### Volcano Plot Custom

regions <- unique(meta$Region)

# Gene Name Mapping for Volcano Plots in RNA-seq Analysis
# Check if the annotation dataframe has the necessary columns
if (!"ENSEMBL" %in% colnames(annot_reordered) || !"GENENAME" %in% colnames(annot_reordered)) {
  stop("Error: The annotation dataframe must contain both 'ENSEMBL' and 'GENENAME' columns.")
}
# Create a named vector for ENSEMBL to GENENAME mapping
ensembl_to_genename <- setNames(annot_reordered$GENENAME, annot_reordered$ENSEMBL)
# Map ENSEMBL IDs to GeneNames in the res list
for (contrast_name in names(res)) {
  # Ensure the dataframe has ENSEMBL IDs as rownames
  if (!"ENSEMBL" %in% colnames(res[[contrast_name]])) {
    res[[contrast_name]]$ENSEMBL <- rownames(res[[contrast_name]])
  }
  
  # Map GENENAMEs to the dataframe using ENSEMBL IDs
  res[[contrast_name]]$GENENAME <- ensembl_to_genename[res[[contrast_name]]$ENSEMBL]
  
  # Replace NA values in GENENAME with ENSEMBL IDs (to ensure plotting works even if some gene names are missing)
  res[[contrast_name]]$GENENAME[is.na(res[[contrast_name]]$GENENAME)] <- res[[contrast_name]]$ENSEMBL[is.na(res[[contrast_name]]$GENENAME)]
}


# Source the helper function
source("volcano_helpers.R")

# Create lists to store both versions of volcano plots
volcano_plots_no_labels <- list()
volcano_plots_with_labels <- list()

# Iterate over each contrast in `res` and create custom volcano plots with/without labels
for (contrast_name in names(res)) {
  # Ensure the GeneName column is present in the dataframe for labeling
  if (!"GeneName" %in% colnames(res[[contrast_name]])) {
    # Map ENSEMBL to GeneName using the preprocessed mapping vector
    res[[contrast_name]]$GeneName <- sapply(rownames(res[[contrast_name]]), function(x) gsub(".*_", "", x))
  }
  
  # Generate volcano plots with and without labels using the helper function
  volcano_plots <- create_custom_volcano_plot(
    df = res[[contrast_name]],
    logFC_col = "logFC",
    pvalue_col = "P.Value",
    adj_pvalue_col = "adj.P.Val",
    contrast_name = contrast_name,
    fc_cutoff = 0,
    pvalue_cutoff = 0.05,
    save_plot = TRUE, 
    output_path = "../output/",
    show_labels = TRUE  # Always generate both labeled and unlabeled versions
  )
  
  # Store the plots in separate lists
  volcano_plots_no_labels[[contrast_name]] <- volcano_plots$No_Labels
  volcano_plots_with_labels[[contrast_name]] <- volcano_plots$With_Labels
}

# Combine and display the volcano plots without labels
patchwork::wrap_plots(volcano_plots_no_labels, ncol = 3)

# Specify the output directory
output_dir <- "../output/volcano_plots"

# Ensure the directory exists
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

# List of contrasts to save
contrasts_to_save <- c("AF_vs_sham_RA", "AF_vs_sham_LA", "met_vs_placebo_RA", "met_vs_placebo_LA")

# Loop over each specified contrast and save both labeled and unlabeled plots
for (contrast_name in contrasts_to_save) {
  # Print the plots to console (optional, useful for debugging)
  print(volcano_plots_with_labels[[contrast_name]])
  print(volcano_plots_no_labels[[contrast_name]])
  
  # Save the plot with labels
  ggsave(
    filename = file.path(output_dir, paste0("volcano_plot_", contrast_name, "_With_Labels.png")),
    plot = volcano_plots_with_labels[[contrast_name]],
    width = 6,
    height = 6,
    dpi = 600
  )
  
  # Save the plot without labels
  ggsave(
    filename = file.path(output_dir, paste0("volcano_plot_", contrast_name, "_No_Labels.png")),
    plot = volcano_plots_no_labels[[contrast_name]],
    width = 6,
    height = 6,
    dpi = 600
  )
}
## Warning: ggrepel: 1579 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: ggrepel: 1690 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: ggrepel: 372 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: ggrepel: 485 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: ggrepel: 3781 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: ggrepel: 3948 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Explorative Data Analysis ## Heatmap Generator

## Heatmap Generator using Non-Batch Corrected Values with Error Handling
generate_pheatmap_for_goid_non_batch <- function(go_file, annot, logCPM, res_all, meta, meta_reordered, goid, contrast1, contrast2, region1, region2) {
  # Get GO annotation
  go <- fread(go_file)
  go <- go[, c("Gene stable ID", "GO term accession", "GO term name", "GO domain")]
  setnames(go, new = c("ENSEMBL", "GOID", "Description", "GOdomain"))
  go <- go[GOdomain != "", ]
  
  # Filter for specified GO term
  go_activity <- go[GOID == goid, "ENSEMBL"]
  go_ensembl <- annot[annot$ENSEMBL %in% go_activity$ENSEMBL, c("GENENAME", "ENSEMBL")]
  subset_logCPM <- logCPM[rownames(logCPM) %in% go_ensembl$ENSEMBL, ]
  rownames(subset_logCPM) <- go_ensembl$GENENAME[match(rownames(subset_logCPM), go_ensembl$ENSEMBL)]
  
  # Significant genes
  sig_genes <- res_all[res_all$ENSEMBL %in% go_ensembl$ENSEMBL & res_all$adj.P.Val < 0.05, ]
  sig_contrast1_genes <- sig_genes[sig_genes$Contrast == contrast1, "GENENAME"]
  sig_contrast2_genes <- sig_genes[sig_genes$Contrast == contrast2, "GENENAME"]
  
  # Check if there are any significant genes for contrast1 and region1
  if (length(sig_contrast1_genes) > 0) {
    # Generate heatmap for the first region using non-batch corrected values
    region1_subset_logCPM <- subset_logCPM[, row.names(meta)[meta$Region == region1]]
    region1_subset_logCPM <- region1_subset_logCPM[row.names(region1_subset_logCPM) %in% sig_contrast1_genes, ]
    if (nrow(region1_subset_logCPM) > 0) {
      pheatmap(region1_subset_logCPM,
               annotation_col = meta_reordered[, "Group", drop=FALSE],
               annotation_colors = list(Group = publication_colors),
               scale = "row",
               show_rownames = TRUE,
               fontsize = 10,
               fontsize_row = 8,
               fontsize_col = 8,
               color = colorRampPalette(viridisLite::magma(50))(50),
               main = paste("Non-Batch Corrected Heatmap for", goid, "in", region1))
    } else {
      print(paste("No significant genes to display for contrast:", contrast1, "in region:", region1))
    }
  } else {
    print(paste("No significant genes found for contrast:", contrast1, "in region:", region1))
  }
  
  # Check if there are any significant genes for contrast2 and region2
  if (length(sig_contrast2_genes) > 0) {
    # Generate heatmap for the second region using non-batch corrected values
    region2_subset_logCPM <- subset_logCPM[, row.names(meta)[meta$Region == region2]]
    region2_subset_logCPM <- region2_subset_logCPM[row.names(region2_subset_logCPM) %in% sig_contrast2_genes, ]
    if (nrow(region2_subset_logCPM) > 0) {
      pheatmap(region2_subset_logCPM,
               annotation_col = meta_reordered[, "Group", drop=FALSE],
               annotation_colors = list(Group = publication_colors),
               scale = "row",
               show_rownames = TRUE,
               fontsize = 10,
               fontsize_row = 8,
               fontsize_col = 8,
               color = colorRampPalette(viridisLite::viridis(50))(50),
               main = paste("Non-Batch Corrected Heatmap for", goid, "in", region2))
    } else {
      print(paste("No significant genes to display for contrast:", contrast2, "in region:", region2))
    }
  } else {
    print(paste("No significant genes found for contrast:", contrast2, "in region:", region2))
  }
}

# Example usage with non-batch corrected logCPM values
generate_pheatmap_for_goid_non_batch(
  go_file = "../../../data/gene_annotation/horse_GO.tsv.gz",
  annot = annot,
  logCPM = logCPM,                 # Using non-batch corrected logCPM
  res_all = res_all,
  meta = meta,
  meta_reordered = meta_reordered,
  goid = "GO:0004672",             # Example GOID for protein kinase activity
  contrast1 = "AF_vs_sham_RA",     # Example Contrast 1
  contrast2 = "AF_vs_sham_LA",     # Example Contrast 2
  region1 = "RA",                  # Example Region 1
  region2 = "LA"                   # Example Region 2
)

# Ion-Channel Compex: GO:0034702 with non-batch corrected values
generate_pheatmap_for_goid_non_batch(
  go_file = "../../../data/gene_annotation/horse_GO.tsv.gz",
  annot = annot,
  logCPM = logCPM,                 # Using non-batch corrected logCPM
  res_all = res_all,
  meta = meta,
  meta_reordered = meta_reordered,
  goid = "GO:0034702",             # Ion-channel Complex
  contrast1 = "met_vs_placebo_RA",     
  contrast2 = "met_vs_placebo_LA",     
  region1 = "RA",                  
  region2 = "LA"                   
)

## [1] "No significant genes found for contrast: met_vs_placebo_LA in region: LA"
## Ionchannel Heatmap used in Main Figure

#Example for mitochondrial matrix GO:0005759
generate_pheatmap_for_goid_non_batch(
  go_file = "../../../data/gene_annotation/horse_GO.tsv.gz",
  annot = annot,
  logCPM = logCPM,                 # Using non-batch corrected logCPM
  res_all = res_all,
  meta = meta,
  meta_reordered = meta_reordered,
  goid = "GO:0005759",          # Mitochondrial Matrix
  contrast1 = "AF_vs_sham_RA",  
  contrast2 = "AF_vs_sham_LA",  
  region1 = "RA",               
  region2 = "LA"                
)

## Mitochondrial Matrix Heatmap used in Supplemnetary Figure

# Generate the heatmap and capture it as a grob
heatmap_RA <- generate_pheatmap_for_goid_non_batch(
  go_file = "../../../data/gene_annotation/horse_GO.tsv.gz",
  annot = annot,
  logCPM = logCPM,                 
  res_all = res_all,
  meta = meta,
  meta_reordered = meta_reordered,
  goid = "GO:0034702",             
  contrast1 = "met_vs_placebo_RA",     
  contrast2 = "met_vs_placebo_LA",     
  region1 = "RA",                  
  region2 = "LA"                   
)

## [1] "No significant genes found for contrast: met_vs_placebo_LA in region: LA"